# Importing necessary packages
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
data = pd.read_excel('nba_player_data.xlsx')
data.sample(10)
| Year | Season_Type | PLAYER_ID | RANK | PLAYER | TEAM_ID | TEAM | GP | MIN | FGM | FGA | FG_PCT | FG3M | FG3A | FG3_PCT | FTM | FTA | FT_PCT | OREB | DREB | REB | AST | STL | BLK | TOV | PF | PTS | EFF | AST_TOV | STL_TOV | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2018-19 | Regular%20Season | 201935 | 1 | James Harden | 1610612745 | HOU | 78 | 2867 | 843 | 1909 | 0.442 | 378 | 1028 | 0.368 | 754 | 858 | 0.879 | 66 | 452 | 518 | 586 | 158 | 58 | 387 | 244 | 2818 | 2581 | 1.51 | 0.41 |
| 1 | 2018-19 | Regular%20Season | 202331 | 2 | Paul George | 1610612760 | OKC | 77 | 2841 | 707 | 1614 | 0.438 | 292 | 757 | 0.386 | 453 | 540 | 0.839 | 105 | 523 | 628 | 318 | 170 | 34 | 205 | 214 | 2159 | 2110 | 1.55 | 0.83 |
| 2 | 2018-19 | Regular%20Season | 202689 | 3 | Kemba Walker | 1610612766 | CHA | 82 | 2863 | 731 | 1684 | 0.434 | 260 | 731 | 0.356 | 380 | 450 | 0.844 | 52 | 309 | 361 | 484 | 102 | 34 | 211 | 131 | 2102 | 1849 | 2.29 | 0.48 |
| 3 | 2018-19 | Regular%20Season | 203078 | 4 | Bradley Beal | 1610612764 | WAS | 82 | 3028 | 764 | 1609 | 0.475 | 209 | 596 | 0.351 | 362 | 448 | 0.808 | 89 | 322 | 411 | 448 | 121 | 58 | 224 | 226 | 2099 | 1982 | 2.00 | 0.54 |
| 4 | 2018-19 | Regular%20Season | 203081 | 5 | Damian Lillard | 1610612757 | POR | 80 | 2838 | 681 | 1533 | 0.444 | 237 | 643 | 0.369 | 468 | 513 | 0.912 | 68 | 303 | 371 | 551 | 88 | 34 | 212 | 148 | 2067 | 2002 | 2.60 | 0.41 |
# Check if dataset has any null values
data.isna().sum()
Year 0 Season_Type 0 PLAYER_ID 0 RANK 0 PLAYER 0 TEAM_ID 0 TEAM 0 GP 0 MIN 0 FGM 0 FGA 0 FG_PCT 0 FG3M 0 FG3A 0 FG3_PCT 0 FTM 0 FTA 0 FT_PCT 0 OREB 0 DREB 0 REB 0 AST 0 STL 0 BLK 0 TOV 0 PF 0 PTS 0 EFF 0 AST_TOV 0 STL_TOV 0 dtype: int64
# Dropping unecessary data columns
data.drop (columns=['RANK','TEAM_ID','EFF'], inplace=True)
# Making the "Year" into "Starting Year"
data['season_start_year'] = data['Year'].str[:4].astype(int)
# Changing NOH to NOP for TEAM
data['TEAM'].replace(to_replace=['NOP','NOH'], value = 'NO', inplace = True)
# Cleaning the Season Type
data['Season_Type'].replace('Regular%20Season', 'RS', inplace = True)
# Creating a dataframe for Regular Season and Playoffs
rs_df = data[data['Season_Type']=='RS']
playoffs_df = data[data['Season_Type']=='Playoffs']
data.columns
Index(['Year', 'Season_Type', 'PLAYER_ID', 'PLAYER', 'TEAM', 'GP', 'MIN',
'FGM', 'FGA', 'FG_PCT', 'FG3M', 'FG3A', 'FG3_PCT', 'FTM', 'FTA',
'FT_PCT', 'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'PF',
'PTS', 'AST_TOV', 'STL_TOV', 'season_start_year'],
dtype='object')
total_cols = ['MIN','FGM','FGA','FG3M','FG3A','FTM','FTA',
'OREB','DREB','REB','AST','STL','BLK','TOV','PF','PTS']
data
| Year | Season_Type | PLAYER_ID | PLAYER | TEAM | GP | MIN | FGM | FGA | FG_PCT | FG3M | FG3A | FG3_PCT | FTM | FTA | FT_PCT | OREB | DREB | REB | AST | STL | BLK | TOV | PF | PTS | AST_TOV | STL_TOV | season_start_year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2018-19 | RS | 201935 | James Harden | HOU | 78 | 2867 | 843 | 1909 | 0.442 | 378 | 1028 | 0.368 | 754 | 858 | 0.879 | 66 | 452 | 518 | 586 | 158 | 58 | 387 | 244 | 2818 | 1.51 | 0.41 | 2018 |
| 1 | 2018-19 | RS | 202331 | Paul George | OKC | 77 | 2841 | 707 | 1614 | 0.438 | 292 | 757 | 0.386 | 453 | 540 | 0.839 | 105 | 523 | 628 | 318 | 170 | 34 | 205 | 214 | 2159 | 1.55 | 0.83 | 2018 |
| 2 | 2018-19 | RS | 202689 | Kemba Walker | CHA | 82 | 2863 | 731 | 1684 | 0.434 | 260 | 731 | 0.356 | 380 | 450 | 0.844 | 52 | 309 | 361 | 484 | 102 | 34 | 211 | 131 | 2102 | 2.29 | 0.48 | 2018 |
| 3 | 2018-19 | RS | 203078 | Bradley Beal | WAS | 82 | 3028 | 764 | 1609 | 0.475 | 209 | 596 | 0.351 | 362 | 448 | 0.808 | 89 | 322 | 411 | 448 | 121 | 58 | 224 | 226 | 2099 | 2.00 | 0.54 | 2018 |
| 4 | 2018-19 | RS | 203081 | Damian Lillard | POR | 80 | 2838 | 681 | 1533 | 0.444 | 237 | 643 | 0.369 | 468 | 513 | 0.912 | 68 | 303 | 371 | 551 | 88 | 34 | 212 | 148 | 2067 | 2.60 | 0.41 | 2018 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3840 | 2022-23 | Playoffs | 203648 | Thanasis Antetokounmpo | MIL | 2 | 5 | 0 | 0 | 0.000 | 0 | 0 | 0.000 | 0 | 0 | 0.000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.00 | 0.00 | 2022 |
| 3841 | 2022-23 | Playoffs | 1628418 | Thomas Bryant | DEN | 1 | 1 | 0 | 0 | 0.000 | 0 | 0 | 0.000 | 0 | 0 | 0.000 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.00 | 0.00 | 2022 |
| 3842 | 2022-23 | Playoffs | 2617 | Udonis Haslem | MIA | 2 | 3 | 0 | 3 | 0.000 | 0 | 1 | 0.000 | 0 | 0 | 0.000 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0.00 | 0.00 | 2022 |
| 3843 | 2022-23 | Playoffs | 1628427 | Vlatko Cancar | DEN | 5 | 10 | 0 | 5 | 0.000 | 0 | 4 | 0.000 | 0 | 0 | 0.000 | 0 | 3 | 3 | 1 | 0 | 0 | 0 | 0 | 0 | 0.00 | 0.00 | 2022 |
| 3844 | 2022-23 | Playoffs | 1631111 | Wendell Moore Jr. | MIN | 1 | 2 | 0 | 0 | 0.000 | 0 | 0 | 0.000 | 0 | 0 | 0.000 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0.00 | 0.00 | 2022 |
3845 rows × 28 columns
# Grouping by Stats for the Reg Season
data_per_min = data.groupby(['PLAYER','PLAYER_ID','Year'])[total_cols].sum().reset_index()
for col in data_per_min.columns[4:]:
data_per_min[col] = data_per_min[col]/data_per_min['MIN']
data_per_min['FG%'] = data_per_min['FGM']/data_per_min['FGA']
data_per_min['3PT%'] = data_per_min['FG3M']/data_per_min['FG3A']
data_per_min['FT%'] = data_per_min['FTM']/data_per_min['FTA']
data_per_min['FG3A%'] = data_per_min['FG3A']/data_per_min['FGA']
data_per_min['PTS/FGA'] = data_per_min['PTS']/data_per_min['FGA']
data_per_min['FG3M/FGM'] = data_per_min['FG3M']/data_per_min['FGM']
data_per_min['FTA/FGA'] = data_per_min['FTA']/data_per_min['FGA']
data_per_min['TRU%'] = 0.5*data_per_min['PTS']/(data_per_min['FGA']+0.475*data_per_min['FTA'])
data_per_min['AST_TOV'] = data_per_min['AST']/data_per_min['TOV']
# Dropping players that played 50 or less minutes
data_per_min = data_per_min[data_per_min['MIN']>=50]
data_per_min.drop(columns='PLAYER_ID', inplace=True)
# Heat Map showing the correlation between several stats
fig = px.imshow(data_per_min.corr())
fig.show()